{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip (rather than !pip) installs into the environment of the running kernel,\n",
    "# so the imports below resolve against the packages that were just installed.\n",
    "%pip install elasticsearch pandas\n",
    "\n",
    "from elasticsearch import Elasticsearch, helpers\n",
    "from elasticsearch.helpers import BulkIndexError\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Elasticsearch Client"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize the Elasticsearch client.\n",
    "# HOST:PORT, USERNAME and PASSWORD are placeholders: replace them with the\n",
    "# values of your own deployment before running this cell.\n",
    "# NOTE(review): the 8.x Python client expects a full URL including the scheme\n",
    "# (e.g. 'https://localhost:9200') -- confirm against the installed version.\n",
    "es = Elasticsearch(\n",
    "    ['HOST:PORT'],\n",
    "    basic_auth=('USERNAME', 'PASSWORD'),\n",
    "    # verify_certs=False disables TLS certificate verification; keep it only\n",
    "    # for local or self-signed test clusters.\n",
    "    verify_certs=False\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create the index and parse the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Name of the index that will hold the recipe documents\n",
    "index_name = 'recipes'\n",
    "\n",
    "# Field mapping for the index: every CSV column is indexed as text (the\n",
    "# summary with the english analyzer) and ml.tokens is a rank_features field.\n",
    "mapping = {\n",
    "  \"mappings\": {\n",
    "    \"properties\": {\n",
    "      \"group\": { \"type\": \"text\" },\n",
    "      \"name\": { \"type\": \"text\" },\n",
    "      \"rating\": { \"type\": \"text\" },\n",
    "      \"n_rater\": { \"type\": \"text\" },\n",
    "      \"n_reviewer\": { \"type\": \"text\" },\n",
    "      \"summary\": {\n",
    "        \"type\": \"text\",\n",
    "        \"analyzer\": \"english\"\n",
    "      },\n",
    "      \"process\": { \"type\": \"text\" },\n",
    "      \"ingredient\": {\n",
    "        \"type\": \"text\",\n",
    "      },\n",
    "      \"ml.tokens\": {\n",
    "        \"type\": \"rank_features\"\n",
    "      }\n",
    "    }\n",
    "  }\n",
    "}\n",
    "\n",
    "# Create the index only when it does not exist yet, so re-running this cell\n",
    "# does not fail because the index is already there. The mapping goes through\n",
    "# the `mappings` parameter because the generic `body` parameter is deprecated\n",
    "# in the 8.x Python client.\n",
    "if not es.indices.exists(index=index_name):\n",
    "    es.indices.create(index=index_name, mappings=mapping[\"mappings\"])\n",
    "\n",
    "# Read CSV file with pandas; errors='ignore' drops bytes that are not valid UTF-8\n",
    "with open('recipe_dataset.csv', 'r', encoding='utf-8', errors='ignore') as file:\n",
    "    df = pd.read_csv(file)\n",
    "\n",
    "# Convert the DataFrame to one dict per row for indexing. Empty CSV cells are\n",
    "# loaded as NaN, which would be serialized as a bare NaN token (not valid JSON),\n",
    "# so missing fields are left out of the document instead.\n",
    "recipes = [\n",
    "    {field: value for field, value in row.items() if pd.notna(value)}\n",
    "    for row in df.to_dict('records')\n",
    "]\n",
    "print(f\"Number of documents: {len(recipes)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Bulk Index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build one bulk action per recipe. `_index` reuses index_name so the documents\n",
    "# always go to the index configured earlier in the notebook, and `pipeline`\n",
    "# names the ingest pipeline applied to each document.\n",
    "# NOTE(review): the 'elser-v1-recipes' pipeline is not created in this notebook;\n",
    "# confirm it exists on the cluster before running.\n",
    "bulk_index_body = [\n",
    "    {\n",
    "        \"_index\": index_name,\n",
    "        \"pipeline\": \"elser-v1-recipes\",\n",
    "        \"_source\": recipe\n",
    "    }\n",
    "    for recipe in recipes\n",
    "]\n",
    "\n",
    "# Bulk index the data in chunks of 500 documents with a 3-minute request\n",
    "# timeout; on BulkIndexError print the id, error type and reason of every\n",
    "# document that failed.\n",
    "try:\n",
    "    response = helpers.bulk(es, bulk_index_body, chunk_size=500, request_timeout=60*3)\n",
    "    print(\"\\nRESPONSE:\", response)\n",
    "except BulkIndexError as e:\n",
    "    for error in e.errors:\n",
    "        print(f\"Document ID: {error['index']['_id']}\")\n",
    "        print(f\"Error Type: {error['index']['error']['type']}\")\n",
    "        print(f\"Error Reason: {error['index']['error']['reason']}\")"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
